import numpy as np # ndarry 다차원 배열 (vector1차원, matrix2차원, tensor= 3차원)
import scipy # 과학계산용 패키지 ( optimization, interpolation ( 보간법) )
import sklearn.metrics.pairwise
# accuracy_score, score
# norm은 원점에서의 거리값
# Two nearly identical 3-D points, held in float64 and in float32, used to
# compare distance computations across libraries and precisions.
a_64 = np.array([61.22, 71.60, -65.755], dtype=np.float64)
b_64 = np.array([61.22, 71.608, -65.72], dtype=np.float64)
a_32 = a_64.astype(np.float32)
b_32 = b_64.astype(np.float32)

# Euclidean distance as the norm of the difference vector.
dist_64_np = np.array([np.linalg.norm(a_64 - b_64)], dtype=np.float64)
dist_32_np = np.array([np.linalg.norm(a_32 - b_32)], dtype=np.float32)

# The cityblock (Manhattan) result gets its own name so the next line no
# longer overwrites it, and both inputs are float64 as the name implies.
dist_64_cityblock = sklearn.metrics.pairwise.pairwise_distances(
    [a_64], [b_64], metric="cityblock")
# Default metric of pairwise_distances (no metric argument given).
dist_64_sklearn = sklearn.metrics.pairwise.pairwise_distances([a_64], [b_64])

from sklearn.metrics.pairwise import euclidean_distances
print("euclidean_distances", euclidean_distances([a_64], [b_64]))
euclidean_distances [[0.03590265]]
from sklearn import cluster, datasets
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load iris and keep only the first two feature columns so the clusters can
# be drawn on a plain 2-D scatter plot.
iris = datasets.load_iris()
X = iris.data[:, :2]
y_iris = iris.target

# One K-Means fit per candidate number of clusters.
km2 = cluster.KMeans(n_clusters=2).fit(X)
km3 = cluster.KMeans(n_clusters=3).fit(X)
km4 = cluster.KMeans(n_clusters=4).fit(X)

plt.figure(figsize=(9, 3))
# (subplot slot, fitted model, title template) for each panel, left to right.
# The title reports inertia_; per the original note, smaller means tighter
# clusters.
panels = (
    (131, km2, "K=2, J=%.2f"),
    (132, km3, "K=3, J=%.2f"),
    (133, km4, "K=4, J=%.2f"),
)
for slot, fitted, title_fmt in panels:
    plt.subplot(slot)
    plt.scatter(X[:, 0], X[:, 1], c=fitted.labels_)
    plt.title(title_fmt % fitted.inertia_)
km4.n_iter_
km4.cluster_centers_
array([[5.90980392, 2.73529412],
[6.85348837, 3.1 ],
[4.76923077, 2.93076923],
[5.21333333, 3.65333333]])
km2.labels_
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1])
km4.n_iter_ # iteration
6
km4.cluster_centers_ # 그룹의 특성을 이해
array([[5.90980392, 2.73529412],
[6.85348837, 3.1 ],
[4.76923077, 2.93076923],
[5.21333333, 3.65333333]])
# clustering 이후에 3차원으로 시각화
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
# Seed NumPy's global RNG before fitting (the estimators below are created
# without a random_state).
np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Figure title -> estimator to fit on the full four-feature data.
estimators = {'_3': KMeans(n_clusters=3),
              '_8': KMeans(n_clusters=8)}

fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum)
    plt.clf()  # start from an empty figure
    # Create the 3-D axes through the figure instead of instantiating Axes3D
    # directly; elev / azim are the vertical / horizontal camera angles.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
    plt.cla()
    est.fit(X)
    labels = est.labels_
    # Column order: 0 sepal length, 1 sepal width, 2 petal length, 3 petal width.
    # The builtin float replaces the deprecated np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2],
               c=labels.astype(float), s=100)
    # Hide tick labels; xaxis/yaxis/zaxis replace the deprecated w_* aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    plt.title(name)
    fignum = fignum + 1
plt.show()
<ipython-input-19-b97f5529ebd5>:20: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations c = labels.astype(np.float), s= 100) <ipython-input-19-b97f5529ebd5>:20: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations c = labels.astype(np.float), s= 100)
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
# kmeans와 scale의 관계 : 중요 - 사이즈에 따라 거리값이 불공평
import matplotlib.pyplot as plt
# Load the digits dataset and standardize its feature matrix with sklearn's scale().
digits = load_digits()
data = scale( digits. data) # z-score standardization
def print_digits(images, labels):
    """Draw up to the first ten images in a single row, each titled with its label.

    images: array whose first axis indexes the images (images.shape[0] is the
        count); each images[i] is passed to imshow.
    labels: indexable object; labels[i] becomes the title of image i.
    """
    f = plt.figure(figsize=(10, 2))
    plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
    # At most ten panels, fewer when fewer images are supplied.
    shown = min(10, images.shape[0])
    for i in range(shown):
        ax = f.add_subplot(1, 10, i + 1)  # subplot slots are 1-based
        ax.imshow(images[i], cmap=plt.cm.bone)
        ax.grid(False)
        ax.set_title(labels[i])
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
    # Lay the figure out once after all panels exist rather than once per
    # panel; skipped for an empty figure, as the original loop never ran then.
    if shown:
        plt.tight_layout()
# Title each image with its real target label instead of assuming the first
# ten samples happen to be the digits 0..9 in order.
print_digits(digits.images, digits.target)
digits.data.shape # 1797장
(1797, 64)
digits.images.shape # 이미지로 찍을 때 차원
(1797, 8, 8)
digits.images[1]
array([[ 0., 0., 0., 12., 13., 5., 0., 0.],
[ 0., 0., 0., 11., 16., 9., 0., 0.],
[ 0., 0., 3., 15., 16., 6., 0., 0.],
[ 0., 7., 15., 16., 16., 2., 0., 0.],
[ 0., 0., 1., 16., 16., 3., 0., 0.],
[ 0., 0., 1., 16., 16., 6., 0., 0.],
[ 0., 0., 1., 16., 16., 6., 0., 0.],
[ 0., 0., 0., 11., 16., 10., 0., 0.]])
from sklearn.model_selection import train_test_split
# Split features, targets and raw images together so the three stay aligned.
X_train, X_test, y_train, y_test, images_train, images_test \
= train_test_split(
data, digits.target, digits.images, test_size=0.25, random_state=42)
# Original note: clustering determines a label for each sample, which is then
# compared with what supervised learning would give.
# NOTE(review): cluster ids are arbitrary and need not equal the digit shown — confirm.
# Ten centers are created; new data is then assigned to one of them.
from sklearn.cluster import KMeans
# One cluster per digit class 0-9.
clf = KMeans(init="k-means++", n_clusters=10, random_state=42)
clf.fit(X_train)
# Titles here are the cluster ids assigned during fitting.
print_digits(images_train, clf.labels_)
y_pred = clf.predict(X_test)
def print_cluster(images, y_pred, cluster_number):
    """Show, via print_digits, the images whose predicted cluster is cluster_number.

    images: array of images aligned with y_pred.
    y_pred: array of predicted cluster ids.
    cluster_number: the cluster id to display (print_digits shows at most ten).
    """
    selected = y_pred == cluster_number
    print_digits(images[selected], y_pred[selected])


# One row of images for each of the ten cluster ids.
for cluster_id in range(10):
    print_cluster(images_test, y_pred, cluster_id)
<Figure size 720x144 with 0 Axes>
# Import make_blobs from the public sklearn.datasets namespace; the
# sklearn.datasets.samples_generator path is deprecated since 0.22 and
# scheduled for removal in 0.24.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=100, centers=5, random_state=101)
C:\Users\1\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:143: FutureWarning: The sklearn.datasets.samples_generator module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.datasets. Anything that cannot be imported from sklearn.datasets is now part of the private API. warnings.warn(message, FutureWarning)
import seaborn as sns
from sklearn.cluster import KMeans
# Set the figure defaults before anything is drawn so they apply to the
# elbow plot itself, not only to later figures.
plt.rcParams.update({'figure.figsize': (10, 7.5), 'figure.dpi': 100})

# Inertia of a K-Means fit for every candidate cluster count 1..19.
cluster_counts = list(range(1, 20))
elbow = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++',
                    random_state=101)
    kmeans.fit(X)
    elbow.append(kmeans.inertia_)

# x and y are passed by keyword: seaborn 0.12+ rejects them as positionals.
sns.lineplot(x=cluster_counts, y=elbow, color='blue')
plt.title('elbow method')
plt.show()
C:\Users\1\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
from imageio import imread
import matplotlib.pyplot as plt
# Read the photo and display it.
img = imread("garden.jpg")
plt.imshow(img)
img.shape
# Collapse the two leading axes so each row holds the z values of one pixel.
x, y, z = img.shape
long_img = img.reshape(-1, z)
long_img.shape
(446400, 3)
from sklearn.datasets import load_sample_image # 이미지 학습
from sklearn.utils import shuffle
n_colors = 16  # palette size, i.e. the number of K-Means clusters

# Sample image converted to float64 and divided by 255 (the original note
# says the raw values span 0-255).
china = load_sample_image("china.jpg")
china = np.array(china, dtype=np.float64) / 255

# w is the size of the first axis, h of the second, d of the last.
original_shape = tuple(china.shape)
w, h, d = original_shape
print(w, h, d)
assert d == 3  # three values per pixel are expected

# One row per pixel, d columns.
image_array = china.reshape(w * h, d)

print("Kmeans 데이터 피팅")
# Fit on the first 1000 rows of a shuffled copy rather than on every pixel.
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(n_clusters=n_colors, random_state=0)
kmeans.fit(image_array_sample)

print("(k-means) 예측")
# Palette index for every pixel of the full image.
labels = kmeans.predict(image_array)
# 압축, 이미지를 64Color로 재표현
def recreate_image(codebook, labels, w, h):
    """Rebuild a (w, h, d) image from a colour palette and per-pixel palette indices.

    codebook: 2-D array with one palette colour per row; d = codebook.shape[1].
    labels: flat sequence of row indices into codebook, in row-major pixel
        order; the first w * h entries are used.
    w, h: sizes of the first and second axes of the result.

    Returns a float64 array of shape (w, h, d).
    """
    palette = np.asarray(codebook, dtype=np.float64)
    pixel_labels = np.asarray(labels)[: w * h]
    # A single fancy-indexing lookup fetches every pixel's colour at once,
    # replacing the per-pixel Python double loop.
    return palette[pixel_labels].reshape(w, h, palette.shape[1])
427 640 3 Kmeans 데이터 피팅 (k-means) 예측
# Figure 1: the original image on axes filling the whole figure, frame hidden.
plt.figure(1)
plt.clf()
ax = plt.axes([0,0,1,1])
plt.axis('off')
plt.title('Original 이미지 (96,615 colors)')
plt.imshow(china)
# Figure 2: the image rebuilt from the K-Means palette.
plt.figure(2)
plt.clf()
ax = plt.axes([0,0,1,1])
plt.axis('off')
# The title reports the palette size actually used (n_colors) instead of a
# hard-coded 64, and spells "K-Means" correctly.
plt.title('Vector Quantized(VQ) image (%d colors, K-Means)' % n_colors)
plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))
<matplotlib.image.AxesImage at 0x22fb9ef8ac0>
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 51060 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 48120 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 51648 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 51060 missing from current font. font.set_text(s, 0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 48120 missing from current font. font.set_text(s, 0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 51648 missing from current font. font.set_text(s, 0, flags=flags)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import cluster
# Read the photo with matplotlib and show it on a 15x8 figure.
image = plt.imread("서현진1.jpg")
plt.figure(figsize=(15,8))
plt.imshow(image)
<matplotlib.image.AxesImage at 0x22fb9f60e20>
from sklearn.datasets import load_sample_image # 이미지 학습
from sklearn.utils import shuffle
n_colors = 16  # palette size, i.e. the number of K-Means clusters

# Photo converted to float64 and divided by 255 (the original note says the
# raw values span 0-255).
sh = imread("서현진1.jpg")
sh = np.array(sh, dtype=np.float64) / 255

# w is the size of the first axis, h of the second, d of the last.
original_shape = tuple(sh.shape)
w, h, d = original_shape
print(w, h, d)
assert d == 3  # three values per pixel are expected

# One row per pixel, d columns.
image_array = sh.reshape(w * h, d)

print("Kmeans 데이터 피팅")
# Fit on the first 1000 rows of a shuffled copy rather than on every pixel.
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(n_clusters=n_colors, random_state=0)
kmeans.fit(image_array_sample)

print("(k-means) 예측")
# Palette index for every pixel of the full image.
labels = kmeans.predict(image_array)
# 압축, 이미지를 64Color로 재표현
def recreate_image(codebook, labels, w, h):
    """Rebuild a (w, h, d) image from a colour palette and per-pixel palette indices.

    codebook: 2-D array with one palette colour per row; d = codebook.shape[1].
    labels: flat sequence of row indices into codebook, in row-major pixel
        order; the first w * h entries are used.
    w, h: sizes of the first and second axes of the result.

    Returns a float64 array of shape (w, h, d).
    """
    palette = np.asarray(codebook, dtype=np.float64)
    pixel_labels = np.asarray(labels)[: w * h]
    # A single fancy-indexing lookup fetches every pixel's colour at once,
    # replacing the per-pixel Python double loop.
    return palette[pixel_labels].reshape(w, h, palette.shape[1])
656 656 3 Kmeans 데이터 피팅 (k-means) 예측
# Figure 1: the original photo on axes filling the whole figure, frame hidden.
plt.figure(1)
plt.clf()
ax = plt.axes([0,0,1,1])
plt.axis('off')
# Count this image's distinct colours; the hard-coded "96,615" is the figure
# used in the title of the china.jpg plot and does not describe this photo.
distinct_colors = len(np.unique(image.reshape(-1, image.shape[-1]), axis=0))
plt.title('Original 이미지 (%s colors)' % format(distinct_colors, ','))
plt.imshow(image)
# Figure 2: the photo rebuilt from the K-Means palette.
plt.figure(2)
plt.clf()
ax = plt.axes([0,0,1,1])
plt.axis('off')
# Palette size taken from n_colors; "K-Means" spelled correctly.
plt.title('Vector Quantized(VQ) image (%d colors, K-Means)' % n_colors)
plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))
<matplotlib.image.AxesImage at 0x22fb9beeac0>
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 51060 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 48120 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 51648 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 51060 missing from current font. font.set_text(s, 0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 48120 missing from current font. font.set_text(s, 0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 51648 missing from current font. font.set_text(s, 0, flags=flags)
print(image.shape)
# Element count of the 3-D image array divided by 1024, reported as "kb"
# (this treats every element as one byte).
print(image.size / 1024, "kb")
(656, 656, 3) 1260.75 kb
# Remember the three axis sizes, rescale the values by 1/255 as float64, and
# flatten to one row per pixel.
x, y, z = image.shape
image = np.array(image, dtype = np.float64)/ 255
image_2d =image.reshape(x * y, z) # keep only the per-pixel colour values
print(image_2d.shape)
(430336, 3)
# Fit a 16-cluster K-Means on every pixel row and keep the fitted centers.
# No random_state is given here.
kmeans_cluster =cluster.KMeans(n_clusters=16)
kmeans_cluster.fit(image_2d)
cluster_centers =kmeans_cluster.cluster_centers_
cluster_centers
array([[0.81521686, 0.65172505, 0.58347005],
[0.23335047, 0.13015071, 0.09727977],
[0.76118724, 0.59018885, 0.52382254],
[0.54119204, 0.37641395, 0.31796993],
[0.3817041 , 0.23126174, 0.18524984],
[0.93008311, 0.82807302, 0.78674661],
[0.12102418, 0.05318226, 0.02485256],
[0.86349607, 0.71565464, 0.64408491],
[0.70027473, 0.51969257, 0.45952031],
[0.18012572, 0.09035189, 0.05857787],
[0.45966165, 0.30348683, 0.25063392],
[0.62878855, 0.44828408, 0.39002871],
[0.30145963, 0.17775191, 0.13915939],
[0.89519583, 0.77041429, 0.71744371],
[0.64828172, 0.26867463, 0.26539219],
[0.78356185, 0.70167092, 0.6949171 ]])
# Cluster id assigned to each fitted row.
cluster_labels = kmeans_cluster.labels_
print(cluster_labels) # one entry per pixel
[15 15 15 ... 9 9 6]
print(len(cluster_labels))
# "Compressed size": number of palette values plus number of labels, divided
# by 1024.
# NOTE(review): this counts elements, so it equals kilobytes only if every
# palette value and label is stored in one byte — confirm the intended storage.
print("압축사이즈=", ((cluster_centers.shape[0]*
cluster_centers.shape[1]) + len(cluster_labels))/ 1024)
plt.figure(figsize=(15,8))
# Replace each label by its palette colour and restore the image shape.
plt.imshow(cluster_centers[cluster_labels].reshape(x,y,z))
430336 압축사이즈= 420.296875
<matplotlib.image.AxesImage at 0x22fba21c490>
import matplotlib
from matplotlib import font_manager, rc
import platform
# Choose a font family for the plots depending on the operating system.
system_name = platform.system()
if system_name == 'Windows':
    # Resolve the family name from the Malgun Gothic font file.
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif system_name == 'Darwin':
    # macOS
    rc('font', family='AppleGothic')
else:
    # Previously every non-Windows system was treated as macOS.
    # NOTE(review): assumes NanumGothic is installed on other systems — confirm.
    rc('font', family='NanumGothic')
# Render minus signs with the ASCII hyphen.
matplotlib.rcParams['axes.unicode_minus'] = False
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# Generate 100 points around a single center (no random_state is given).
X, label = make_blobs(100, centers =1 )
from sklearn.cluster import KMeans
# A one-cluster K-Means, used to obtain the centroid of the points.
kmeans = KMeans(n_clusters=1)
kmeans.fit(X)
f, ax = plt.subplots(figsize=(7,5))
ax.set_title('원형이상치제거')
ax.scatter(X[:,0], X[:,1], label='Points')
# Draw the fitted centroid in red on top of the points.
ax.scatter(kmeans.cluster_centers_[:,0],
kmeans.cluster_centers_[:,1], label='Centroid', color='r')
ax.legend(loc='best')
<matplotlib.legend.Legend at 0x2a0bef53d90>
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
import matplotlib
# _rebuild() is a private helper and is not guaranteed to exist in every
# Matplotlib release; call it only when present so this cell cannot fail with
# AttributeError.
_rebuild_font_cache = getattr(matplotlib.font_manager, "_rebuild", None)
if _rebuild_font_cache is not None:
    _rebuild_font_cache()
matplotlib.__version__
'3.3.2'
distances = kmeans.transform(X) # convert each sample to its distance from the center point
# Ascending argsort, reversed, first five: indices of the five largest distances.
sorted_idx =np.argsort(distances.ravel())[::-1][:5]
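# Note on argsort: it returns the indices that would sort the data.
# (The original note also lists 10, 8, 2 as sample data values.)
#   index  value
#   0      Z
#   1      B
#   2      A
#   3      F
#   4      C
# Sorting the values in ascending order (A, B, C, F, Z) gives the index
# order 2, 1, 4, 3, 0.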
# Plot all points, the centroid, and the five points farthest from it.
f, ax =plt.subplots(figsize=(7,5))
ax.set_title('Single Cluster')
ax.scatter(X[:,0], X[:, 1], label='Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
kmeans.cluster_centers_[:,1],
label='Centroid', color='r')
# Hollow green circles mark the rows selected by sorted_idx.
ax.scatter(X[sorted_idx][:,0],
X[sorted_idx][:,1],
label='이상치', edgecolors='g',
facecolors='none', s=100)
ax.legend(loc='best')
# Remove those rows and fit a fresh one-cluster model on the remainder.
new_X = np.delete(X, sorted_idx, axis=0)
new_kmeans = KMeans(n_clusters=1)
new_kmeans.fit(new_X)
KMeans(n_clusters=1)
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
# Plot the pruned points with the centroid before (red) and after (magenta)
# the removal.
f, ax =plt.subplots(figsize=(7,5))
ax.set_title('이상치 제거후')
ax.scatter(new_X[:,0], new_X[:, 1], label='Pruned Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
kmeans.cluster_centers_[:,1],
label='이전중심', color='r', s=80, alpha=.5)
ax.scatter(new_kmeans.cluster_centers_[:,0],
new_kmeans.cluster_centers_[:,1],
label='이후중심', color='m', s=80, alpha=.5)
ax.legend(loc='best')
<matplotlib.legend.Legend at 0x2a0c44cea90>
import mglearn # 모델 시각화를 위해 생성모델
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering # 계층적 : 병합
from sklearn.datasets import make_blobs
# Agglomerative clustering into three groups on generated blob data.
X, y = make_blobs(random_state=1)
agg = AgglomerativeClustering(n_clusters=3)
assignment = agg.fit_predict(X) # cluster id assigned to each sample
# Plot the samples marked by their assigned cluster.
mglearn.discrete_scatter(X[:,0],X[:,1], assignment)
# loc: legend location
plt.legend(["클러스터 0", "클러스터 1", "클러스터 2"], loc="best")
plt.xlabel("특성 0")
plt.ylabel("특성 1")
Text(0, 0.5, '특성 1')
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import mglearn
# DBSCAN with default parameters on standardized two-moons data.
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
scaler = StandardScaler()
scaler.fit(X)
# Original note: normalizing is also a good idea for clustering.
X_scaled = scaler.transform(X)
dbscan = DBSCAN() # tunable parameters: eps, min_samples
clusters = dbscan.fit_predict(X_scaled)
# Colour each point by its assigned cluster.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters,
cmap=mglearn.cm2, s=60, edgecolors='black')
plt.xlabel("특성 0")
plt.ylabel("특성 1")
Text(0, 0.5, '특성 1')
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
mglearn.discrete_scatter(X_scaled[:,0], X_scaled[:,1],clusters)
[<matplotlib.lines.Line2D at 0x2a0c53768b0>, <matplotlib.lines.Line2D at 0x2a0c5376bb0>]
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
mglearn.plots.plot_dbscan()
min_samples: 2 eps: 1.000000 cluster: [-1 0 0 -1 0 -1 1 1 0 1 -1 -1] min_samples: 2 eps: 1.500000 cluster: [0 1 1 1 1 0 2 2 1 2 2 0] min_samples: 2 eps: 2.000000 cluster: [0 1 1 1 1 0 0 0 1 0 0 0] min_samples: 2 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0] min_samples: 3 eps: 1.000000 cluster: [-1 0 0 -1 0 -1 1 1 0 1 -1 -1] min_samples: 3 eps: 1.500000 cluster: [0 1 1 1 1 0 2 2 1 2 2 0] min_samples: 3 eps: 2.000000 cluster: [0 1 1 1 1 0 0 0 1 0 0 0] min_samples: 3 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0] min_samples: 5 eps: 1.000000 cluster: [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1] min_samples: 5 eps: 1.500000 cluster: [-1 0 0 0 0 -1 -1 -1 0 -1 -1 -1] min_samples: 5 eps: 2.000000 cluster: [-1 0 0 0 0 -1 -1 -1 0 -1 -1 -1] min_samples: 5 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0]
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
# #############################################################################
# Generate sample data
# Three blob centers; 750 samples are generated around them and standardized.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
random_state=0)
X = StandardScaler().fit_transform(X)
# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# Boolean mask that is True at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# The label -1 is counted as noise.
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Scores comparing the found labels with the generating labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
# The silhouette score uses only the data and the found labels.
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels))
# #############################################################################
# Plot result
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
# One colour per distinct label, evenly spaced over the Spectral colormap.
unique_labels = set(labels)
colors = [plt.cm.Spectral(position)
          for position in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Noise points are drawn in black.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core samples get large markers (14), the remaining members small ones (6).
    for core_filter, marker_size in ((core_samples_mask, 14),
                                     (~core_samples_mask, 6)):
        xy = X[class_member_mask & core_filter]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=marker_size)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Estimated number of clusters: 3 Estimated number of noise points: 18 Homogeneity: 0.953 Completeness: 0.883 V-measure: 0.917 Adjusted Rand Index: 0.952 Adjusted Mutual Information: 0.916 Silhouette Coefficient: 0.626
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
from sklearn.cluster import DBSCAN
from sklearn import metrics
# Public import path; sklearn.datasets.samples_generator is deprecated.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import mglearn

# 750 samples around three centers, then standardized.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
                            cluster_std=0.4, random_state=0)
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# The configured model used to be bound to the misspelled name `bscan` and
# fitted on the unscaled X, so the next line silently reused an older
# default-parameter `dbscan`. Bind the intended name and fit it once on the
# scaled data.
dbscan = DBSCAN(eps=0.3, min_samples=10)
clusters = dbscan.fit_predict(X_scaled)

# Colour each point by its assigned cluster.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters,
            cmap=mglearn.cm2, s=60, edgecolors='black')
plt.xlabel("특성 0")
plt.ylabel("특성 1")
Text(0, 0.5, '특성 1')
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
mglearn.discrete_scatter(X_scaled[:,0], X_scaled[:,1],clusters)
[<matplotlib.lines.Line2D at 0x2a0c5368670>]
C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
mglearn.plots.plot_dbscan()
min_samples: 2 eps: 1.000000 cluster: [-1 0 0 -1 0 -1 1 1 0 1 -1 -1] min_samples: 2 eps: 1.500000 cluster: [0 1 1 1 1 0 2 2 1 2 2 0] min_samples: 2 eps: 2.000000 cluster: [0 1 1 1 1 0 0 0 1 0 0 0] min_samples: 2 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0] min_samples: 3 eps: 1.000000 cluster: [-1 0 0 -1 0 -1 1 1 0 1 -1 -1] min_samples: 3 eps: 1.500000 cluster: [0 1 1 1 1 0 2 2 1 2 2 0] min_samples: 3 eps: 2.000000 cluster: [0 1 1 1 1 0 0 0 1 0 0 0] min_samples: 3 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0] min_samples: 5 eps: 1.000000 cluster: [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1] min_samples: 5 eps: 1.500000 cluster: [-1 0 0 0 0 -1 -1 -1 0 -1 -1 -1] min_samples: 5 eps: 2.000000 cluster: [-1 0 0 0 0 -1 -1 -1 0 -1 -1 -1] min_samples: 5 eps: 3.000000 cluster: [0 0 0 0 0 0 0 0 0 0 0 0]
Import Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.clustering import TimeSeriesKMeans
C:\Users\1\anaconda3\lib\site-packages\tslearn\clustering\kmeans.py:16: UserWarning: Scikit-learn <0.24 will be deprecated in a future release of tslearn warnings.warn(
Load Data
# Read the series table, using the first CSV column as the index.
# NOTE(review): the file name 'ex_eample.csv' looks like a typo of "example" —
# confirm against the actual file on disk before renaming anything.
df = pd.read_csv('ex_eample.csv', index_col=0)
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | |
|---|---|---|---|---|---|---|---|
| idx | |||||||
| 0 | 12 | 27 | 30 | 48 | 59 | 60 | 76 |
| 1 | 11 | 30 | 31 | 44 | 58 | 66 | 70 |
| 2 | 16 | 25 | 36 | 50 | 59 | 68 | 76 |
| 3 | 19 | 26 | 38 | 47 | 56 | 64 | 73 |
| 4 | 18 | 22 | 30 | 43 | 58 | 61 | 72 |
plt.plot(df.T)
plt.show()
Preprocessing
# Per-row mean and standard deviation, computed in one vectorized pass each
# instead of a Python-level iterrows loop. Both are computed before either
# column is added, so the new columns never enter the statistics.
avg_list = df.mean(axis=1).tolist()
std_list = df.std(axis=1).tolist()
df['avg'] = avg_list
df['std'] = std_list
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | avg | std | |
|---|---|---|---|---|---|---|---|---|---|
| idx | |||||||||
| 0 | 12 | 27 | 30 | 48 | 59 | 60 | 76 | 44.571429 | 22.463727 |
| 1 | 11 | 30 | 31 | 44 | 58 | 66 | 70 | 44.285714 | 21.638810 |
| 2 | 16 | 25 | 36 | 50 | 59 | 68 | 76 | 47.142857 | 22.363874 |
| 3 | 19 | 26 | 38 | 47 | 56 | 64 | 73 | 46.142857 | 19.777332 |
| 4 | 18 | 22 | 30 | 43 | 58 | 61 | 72 | 43.428571 | 20.911378 |
# Standardize each row: subtract the row mean (column position 7) and divide
# by the row standard deviation (column position 8) for the seven value
# columns at positions 0-6. The vectorized form replaces the per-cell iloc
# loop; the columns are converted to float first so float results are not
# written into integer columns cell by cell.
value_columns = df.columns[:7]
row_avg = df.iloc[:, 7]
row_std = df.iloc[:, 8]
df[value_columns] = (
    df[value_columns].astype(float).sub(row_avg, axis=0).div(row_std, axis=0)
)
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | avg | std | |
|---|---|---|---|---|---|---|---|---|---|
| idx | |||||||||
| 0 | -1.449957 | -0.782213 | -0.648665 | 0.152627 | 0.642305 | 0.686822 | 1.399081 | 44.571429 | 22.463727 |
| 1 | -1.538241 | -0.660189 | -0.613976 | -0.013204 | 0.633782 | 1.003488 | 1.188341 | 44.285714 | 21.638810 |
| 2 | -1.392552 | -0.990117 | -0.498253 | 0.127757 | 0.530192 | 0.932627 | 1.290346 | 47.142857 | 22.363874 |
| 3 | -1.372423 | -1.018482 | -0.411727 | 0.043340 | 0.498406 | 0.902910 | 1.357976 | 46.142857 | 19.777332 |
| 4 | -1.216016 | -1.024733 | -0.642166 | -0.020495 | 0.696818 | 0.840281 | 1.366310 | 43.428571 | 20.911378 |
df = df.drop(['avg', 'std'], axis=1)
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | |
|---|---|---|---|---|---|---|---|
| idx | |||||||
| 0 | -1.449957 | -0.782213 | -0.648665 | 0.152627 | 0.642305 | 0.686822 | 1.399081 |
| 1 | -1.538241 | -0.660189 | -0.613976 | -0.013204 | 0.633782 | 1.003488 | 1.188341 |
| 2 | -1.392552 | -0.990117 | -0.498253 | 0.127757 | 0.530192 | 0.932627 | 1.290346 |
| 3 | -1.372423 | -1.018482 | -0.411727 | 0.043340 | 0.498406 | 0.902910 | 1.357976 |
| 4 | -1.216016 | -1.024733 | -0.642166 | -0.020495 | 0.696818 | 0.840281 | 1.366310 |
Model1: Euclidean
# K-Means over the rows of df with the Euclidean metric, three clusters;
# result holds one cluster id per row.
model = TimeSeriesKMeans(n_clusters=3, metric='euclidean', random_state=121)
result = model.fit_predict(df)
result
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
df['clst'] = result
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | clst | |
|---|---|---|---|---|---|---|---|---|
| idx | ||||||||
| 0 | -1.449957 | -0.782213 | -0.648665 | 0.152627 | 0.642305 | 0.686822 | 1.399081 | 0 |
| 1 | -1.538241 | -0.660189 | -0.613976 | -0.013204 | 0.633782 | 1.003488 | 1.188341 | 0 |
| 2 | -1.392552 | -0.990117 | -0.498253 | 0.127757 | 0.530192 | 0.932627 | 1.290346 | 0 |
| 3 | -1.372423 | -1.018482 | -0.411727 | 0.043340 | 0.498406 | 0.902910 | 1.357976 | 0 |
| 4 | -1.216016 | -1.024733 | -0.642166 | -0.020495 | 0.696818 | 0.840281 | 1.366310 | 0 |
# Rows of cluster 0 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst0 = df[df['clst'] == 0].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst0 is drawn as one line.
plt.plot(clst0.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
# Rows of cluster 1 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst1 = df[df['clst'] == 1].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst1 is drawn as one line.
plt.plot(clst1.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
# Rows of cluster 2 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst2 = df[df['clst'] == 2].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst2 is drawn as one line.
plt.plot(clst2.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
Model2: DTW
# K-Means over the rows of df with the DTW metric, three clusters.
model2 = TimeSeriesKMeans(n_clusters=3, metric='dtw', random_state=121)
# df still carries the 'clst' column written after Model1; exclude it so the
# previous cluster ids are not fed to this model as an extra time step.
# errors='ignore' keeps the cell runnable when that column is absent.
result2 = model2.fit_predict(df.drop(columns=['clst'], errors='ignore'))
result2
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
df['clst'] = result2
df.head()
| t0 | t1 | t2 | t3 | t4 | t5 | t6 | clst | |
|---|---|---|---|---|---|---|---|---|
| idx | ||||||||
| 0 | -1.449957 | -0.782213 | -0.648665 | 0.152627 | 0.642305 | 0.686822 | 1.399081 | 0 |
| 1 | -1.538241 | -0.660189 | -0.613976 | -0.013204 | 0.633782 | 1.003488 | 1.188341 | 0 |
| 2 | -1.392552 | -0.990117 | -0.498253 | 0.127757 | 0.530192 | 0.932627 | 1.290346 | 0 |
| 3 | -1.372423 | -1.018482 | -0.411727 | 0.043340 | 0.498406 | 0.902910 | 1.357976 | 0 |
| 4 | -1.216016 | -1.024733 | -0.642166 | -0.020495 | 0.696818 | 0.840281 | 1.366310 | 0 |
# Rows of cluster 0 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst0 = df[df['clst'] == 0].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst0 is drawn as one line.
plt.plot(clst0.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
# Rows of cluster 1 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst1 = df[df['clst'] == 1].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst1 is drawn as one line.
plt.plot(clst1.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
# Rows of cluster 2 without the label column. Filtering and dropping in one
# expression yields a new frame, so there is no in-place drop on a slice of
# df (the source of the SettingWithCopyWarning).
clst2 = df[df['clst'] == 2].drop(columns=['clst'])
plt.figure(figsize=(10, 4))
# Transposed so each row of clst2 is drawn as one line.
plt.plot(clst2.T)
plt.show()
C:\Users\1\anaconda3\lib\site-packages\pandas\core\frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop( C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) C:\Users\1\anaconda3\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)